
# Description ----------------------------------------------------------------------------
#   Build main analysis dataset of cities/zip codes common to both utilities.
#   Note: Builds upon 'analyzeCommon.R'

# Setup ----------------------------------------------------------------------------------
# Options
options(stringsAsFactors = FALSE)
# Packages
library(pacman)
p_load(fastverse, readr, data.table, qs, magrittr, here)
fastverse_extend(topics = c('IO', 'DT', 'ST'))
# Project directory
# dir_project = "S:/Edward/Elasticity/"
dir_project = "T:/Home/Edward/Elasticity"
dir_csv     = file.path(dir_project, "DataCsv")
dir_rds     = file.path(dir_project, "DataR")
dir_figures = file.path(dir_project, "Figures")
dir_output  = file.path(dir_project, "Output")
# Collapse package issues
F = FALSE

# Load data: The common zip code panel ---------------------------------------------------
# Load the common zip code data
bill_dt = file.path(
  dir_rds,
  # "Prices/Joint/commonCitiesZipsBills01PercentSample.rds"
  # "Prices/Joint/commonCitiesZipsBills10PercentSample.rds"
  "Prices/Joint/commonCitiesZipsBills.rds"
) %>% read_rds()
# # Load the neighbor panel (starts at 0 for 'common zips')
# bill_dt <- lapply(
#   X = 0:0,
#   # X = 0:1,
#   # X = 0:2,
#   # X = 0:3,
#   FUN = function(g) {
#     file.path(
#       dir_rds,
#       "Prices/Joint",
#       paste0(
#         "zipsNeighbor",
#         g,
#         # "_01PercentSample.rds") %>% readRDS()
#         # "_10PercentSample.rds") %>% readRDS()
#         ".rds"
#       )
#     ) %>% readRDS()
#   }) %>% rbindlist()
# Create lags and leads of flag_thm and flag_rev
setorder(bill_dt, hh_id, begin_date)
bill_dt[, `:=`(
  flag_rev_lead3 = shift(flag_rev,  3L, fill = NA, type = "lead"),
  flag_thm_lead3 = shift(flag_thm,  3L, fill = NA, type = "lead"),
  flag_rev_lead2 = shift(flag_rev,  2L, fill = NA, type = "lead"),
  flag_thm_lead2 = shift(flag_thm,  2L, fill = NA, type = "lead"),
  flag_rev_lead1 = shift(flag_rev,  1L, fill = NA, type = "lead"),
  flag_thm_lead1 = shift(flag_thm,  1L, fill = NA, type = "lead"),
  flag_rev_lag1  = shift(flag_rev,  1L, fill = NA, type = "lag"),
  flag_thm_lag1  = shift(flag_thm,  1L, fill = NA, type = "lag"),
  flag_rev_lag2  = shift(flag_rev,  2L, fill = NA, type = "lag"),
  flag_thm_lag2  = shift(flag_thm,  2L, fill = NA, type = "lag"),
  flag_rev_lag3  = shift(flag_rev,  3L, fill = NA, type = "lag"),
  flag_thm_lag3  = shift(flag_thm,  3L, fill = NA, type = "lag"),
  flag_rev_lag4  = shift(flag_rev,  3L, fill = NA, type = "lag"),
  flag_thm_lag4  = shift(flag_thm,  3L, fill = NA, type = "lag")
), by = hh_id]
# Save dataset
write_fst(
  x = bill_dt,
  path = file.path(dir_rds, 'ForAnalysis', 'full-cities.fst')
)

# Enforce more conservative discontinuity at zip code ------------------------------------
  # Count observations by zip code
  zip_dt = bill_dt[, .(n = .N), by = .(zip, utility)]
  zip_dt %<>% dcast(zip ~ utility)
  common_dt = zip_dt[!(is.na(pge) | is.na(socal))]
  # Drop observations from zip codes not covered by both utilities
  sub_dt = bill_dt[zip %in% common_dt$zip]
  # Drop the full dataset (will not use for estimation)
  rm(bill_dt); gc()
  
# Drop abnormal bills --------------------------------------------------------------------
sub_dt = sub_dt[(
  # Keep bills with 28-34 days
  between(days, 28, 34) &
  # between(days_lag1, 28, 34) &
  # between(days_lag2, 28, 34) &
  # between(days_lag3, 28, 34) &
  # Drop observations flagged due to problems re-calculating bills
# NOTE: I flagged bills with absolute percent error of 5% or more.
  thm > 0 & flag_thm == 0 & bill_flag == FALSE & flag_rev == 0 &
  thm_lag1 > 0 & flag_thm_lag1 == 0 & bill_flag_lag1 == FALSE & flag_rev_lag1 == 0 &
  thm_lag2 > 0 & flag_thm_lag2 == 0 & bill_flag_lag2 == FALSE & flag_rev_lag2 == 0 &
  # thm_lag3 > 0 & flag_thm_lag3 == 0 & bill_flag_lag3 == FALSE & flag_rev_lag3 == 0 &
  TRUE
)]
# Save dataset
write_fst(
  x = sub_dt,
  path = file.path(dir_rds, 'ForAnalysis', 'sub-zips.fst')
)
